Module Preservation | Approach: MF vs Mostafavi

Pairwise comparisons

MF and Mostafavi

# Input: expression matrix and modules
########## Dataset 01 
dir_expr_01 = "/pastel/projects/speakeasy_dlpfc/"
dir_mod_01 = "/pastel/projects/speakeasy_dlpfc/SpeakEasy_net_MF/"

# Residuals of the expression: 
load(paste0(dir_expr_01, "exprdata_byregion.Rdata")) 
res_dataset_01 = exprData_MF # Residuals of the expression
res_dataset_01 = as.data.frame(t(res_dataset_01))
# Remove ENS version
colnames(res_dataset_01) = gsub("(.*)\\.(.*)", "\\1", colnames(res_dataset_01))
rownames(res_dataset_01) = gsub("(.*)_(.*)", "\\2", rownames(res_dataset_01))

# clusters from SpeakEasy: 
k_dataset_01 = read.table(paste0(dir_mod_01, "geneBycluster.txt"), header = T, stringsAsFactors = F) 
k_dataset_01$ensembl = gsub("(.*)\\.(.*)", "\\1", k_dataset_01$ensembl)

# Checking order
# all(k_dataset_01$ensembl == colnames(res_dataset_01))
modules_dataset_01 = setNames(k_dataset_01$cluster_lv3, k_dataset_01$ensembl)
# all(names(modules_dataset_01) == colnames(res_dataset_01))
# dim(res_dataset_01) # 1210 17309
# length(modules_dataset_01) # 17309

## Dataset 1 is ready!

########## Dataset 02
dir_expr_02 = "/pastel/projects/speakeasy_dlpfc/mostafavi/"
dir_mod_02 = "/pastel/projects/speakeasy_dlpfc/mostafavi/"

dat = read_rds(paste0(dir_expr_02, "DLPFC_gene_sara_v1.rds"))
exp_mat = assay(dat) %>% as.data.frame()
sample_meta = colData(dat) %>% as.data.frame()
gene_meta = elementMetadata(dat) %>% as.data.frame()

colnames(exp_mat) = gsub("(.*)\\:(.*)","\\1",colnames(exp_mat)) # Fix projIds

# Match expression with genes with modules 
k_dataset_02 = read.table(paste0(dir_mod_02, "mRNA_annotation.txt"), header = F, stringsAsFactors = F)
colnames(k_dataset_02) = c("symbol", "cluster", "ensembl")
# length(k_dataset_02$symbol[k_dataset_02$cluster == 109]) # 390 genes
# dim(k_dataset_02) 
# dim(exp_mat) 
# Rownames in exp_mat has ensembl ID version
# sum(duplicated(gsub("(.*)\\.(.*)","\\1",rownames(exp_mat)))) # No duplicates
rownames(exp_mat) = gsub("(.*)\\.(.*)","\\1",rownames(exp_mat))

genes_in_common_testset = intersect(rownames(exp_mat),k_dataset_02$ensembl)
# length(genes_in_common_testset)
k_dataset_02 = k_dataset_02[match(genes_in_common_testset,k_dataset_02$ensembl),]
exp_mat = exp_mat[match(genes_in_common_testset,rownames(exp_mat)),]
# all(rownames(exp_mat)==k_dataset_02$ensembl)
# dim(k_dataset_02)

# Mostafavi's matrix has ensembl. We need to match both 
res_dataset_02 = na.omit(as.data.frame(t(data.matrix(exp_mat))))
colnames(res_dataset_02) = k_dataset_02[match(colnames(res_dataset_02),k_dataset_02$ensembl),"ensembl"] # it can be symbol as in the ST data
# all(k_dataset_02$ensembl == colnames(res_dataset_02))
modules_dataset_02 = setNames(k_dataset_02$cluster, k_dataset_02$ensembl)
# all(names(modules_dataset_02) == colnames(res_dataset_02))

# dim(res_dataset_02) # 508 13412
# length(modules_dataset_02) # 13412
# Mostafavi's data is ready!
### Now data is ready to run
## Input: 
# res_dataset_01: expression matrix from dataset 01 (rows are samples, columns are genes) 
# modules_dataset_01: modules from dataset 01 (named array with module labels matching columns from res_dataset_01)
# prefix_dataset_01: Some ID to add to module lables (e.g. MF_)
# res_dataset_02: expression matrix from dataset 02 (rows are samples, columns are genes) 
# modules_dataset_02: modules from dataset 02 (named array with module labels matching columns from res_dataset_02)
# prefix_dataset_02: Some ID to add to module lables (e.g. MF_)

prefix_dataset_01 = "MF_"
prefix_dataset_02 = "Sara_M"
# Add module prefix
modules_dataset_01 = paste0(prefix_dataset_01, modules_dataset_01)
modules_dataset_02 = paste0(prefix_dataset_02, modules_dataset_02)

setLabels = c("dataset_01", "dataset_02")
multiExpr = list(dataset_01 = list(data = res_dataset_01), dataset_02 = list(data = res_dataset_02))
multiColor = list(dataset_01 = modules_dataset_01, dataset_02 = modules_dataset_02)

# Here comes the calculation of module preservation, it takes a while (will test both directions)
enableWGCNAThreads(nThreads = 8)
system.time( {
  mp = modulePreservation(multiExpr, multiColor, dataIsExpr = T,
                          maxModuleSize = 20000,
                          referenceNetworks = c(1,2), # Will first test set 1 as reference, then set 2 as reference.
                          nPermutations = 200, # Default = 200
                          randomSeed = 2023,
                          quickCor = 1, # 0 is suppose to be more precise but we get high Zsummary like 200 instead of 20
                          verbose = 3,
                          parallelCalculation = T)
} )

####################
ref = 1 # MF
test = 2 # Mostafavi

statsObs_12 = cbind(Refence = "MF", Test = "Mostafavi", mp$quality$observed[[ref]][[test]][,-1], mp$preservation$observed[[ref]][[test]][,-1])
statsObs_12$module_id = rownames(statsObs_12)

statsZ_12 = cbind(mp$quality$Z[[ref]][[test]], mp$preservation$Z[[ref]][[test]][,-1])
statsZ_12$module_id = rownames(statsZ_12)

res_df_12 = statsObs_12 %>% dplyr::left_join(statsZ_12, by = c("module_id"))

####################
ref = 2 # Mostafavi
test = 1 # MF

statsObs_21 = cbind(Refence = "Mostafavi", Test = "MF", mp$quality$observed[[ref]][[test]][,-1], mp$preservation$observed[[ref]][[test]][,-1])
statsObs_21$module_id = rownames(statsObs_21)

statsZ_21 = cbind(mp$quality$Z[[ref]][[test]], mp$preservation$Z[[ref]][[test]][,-1])
statsZ_21$module_id = rownames(statsZ_21)

res_df_21 = statsObs_21 %>% dplyr::left_join(statsZ_21, by = c("module_id"))

####################
res_df = bind_rows(res_df_12, res_df_21)

# Save the results
save(mp,res_df,res_df_12,res_df_21, file = paste0(work_dir, "mp_MF_mostafavi.RData"))
load(paste0(work_dir, "mp_MF_mostafavi.RData"))

createDT(res_df)

Question: Are the modules from MF preserved in Mostafavi network?

Reference: Dataset 01 (MF) Test: Dataset 02 (Mostafavi)

# Compare preservation to qualit (generates a table)
p_medianrank <- ggplot(res_df_12, aes(x = moduleSize, y = medianRank.pres )) +
  geom_point(shape = 21) +
  geom_label_repel(aes(label = module_id)) +
  scale_y_reverse() +
  labs(y = "Preservation Median rank", x = "Module size", title = "Preservation Median rank") +
  theme_classic()

p_zsummary <- ggplot(res_df_12, aes(x = moduleSize, y = Zsummary.pres )) + 
  geom_hline(yintercept = c(2,10), linetype = "dashed") +
  geom_point(shape = 21) +
  geom_label_repel(aes(label = module_id)) +
  labs(y = "Preservation Zsummary", x = "Module size", title = "Preservation Zsummary") +
  theme_classic()

# Plot the REFERENCE 
ggarrange(p_medianrank, p_zsummary, ncol=2)

Question: Are the modules from Mostafavi preserved at the MF network?

Reference: Dataset 02 Test: Dataset 01

# Compare preservation to qualit (generates a table)
p_medianrank <- ggplot(res_df_21, aes(x = moduleSize, y = medianRank.pres )) +
  geom_point(shape = 21) +
  geom_label_repel(aes(label = module_id)) +
  scale_y_reverse() +
  labs(y = "Preservation Median rank", x = "Module size", title = "Preservation Median rank") +
  theme_classic()

p_zsummary <- ggplot(res_df_21, aes(x = moduleSize, y = Zsummary.pres )) + 
  geom_hline(yintercept = c(2,10), linetype = "dashed") +
  geom_point(shape = 21) +
  geom_label_repel(aes(label = module_id), max.overlaps = 20) +
  labs(y = "Preservation Zsummary", x = "Module size", title = "Preservation Zsummary") +
  theme_classic()

# Plot the REFERENCE 
ggarrange(p_medianrank, p_zsummary, ncol=2)

Session info

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] SummarizedExperiment_1.24.0 Biobase_2.54.0             
##  [3] GenomicRanges_1.46.1        GenomeInfoDb_1.30.1        
##  [5] IRanges_2.28.0              S4Vectors_0.32.3           
##  [7] BiocGenerics_0.40.0         MatrixGenerics_1.6.0       
##  [9] matrixStats_0.61.0          ggeasy_0.1.3               
## [11] ggpubr_0.4.0                ggrepel_0.9.1              
## [13] WGCNA_1.70-3                fastcluster_1.2.3          
## [15] dynamicTreeCut_1.63-1       forcats_0.5.1              
## [17] stringr_1.4.0               dplyr_1.0.8                
## [19] purrr_0.3.4                 readr_2.1.2                
## [21] tidyr_1.2.0                 tibble_3.1.6               
## [23] ggplot2_3.3.5               tidyverse_1.3.1            
## [25] limma_3.50.1               
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-3       ggsignif_0.6.3         ellipsis_0.3.2        
##   [4] htmlTable_2.4.0        XVector_0.34.0         base64enc_0.1-3       
##   [7] fs_1.5.2               rstudioapi_0.13        farver_2.1.0          
##  [10] DT_0.20                bit64_4.0.5            AnnotationDbi_1.56.2  
##  [13] fansi_1.0.2            lubridate_1.8.0        xml2_1.3.3            
##  [16] codetools_0.2-18       splines_4.1.2          doParallel_1.0.17     
##  [19] cachem_1.0.6           impute_1.68.0          knitr_1.37            
##  [22] Formula_1.2-4          jsonlite_1.7.3         broom_0.7.12          
##  [25] cluster_2.1.2          GO.db_3.14.0           dbplyr_2.1.1          
##  [28] png_0.1-7              compiler_4.1.2         httr_1.4.2            
##  [31] backports_1.4.1        assertthat_0.2.1       Matrix_1.5-1          
##  [34] fastmap_1.1.0          cli_3.2.0              htmltools_0.5.2       
##  [37] tools_4.1.2            gtable_0.3.0           glue_1.6.1            
##  [40] GenomeInfoDbData_1.2.7 Rcpp_1.0.8             carData_3.0-5         
##  [43] cellranger_1.1.0       jquerylib_0.1.4        vctrs_0.3.8           
##  [46] Biostrings_2.62.0      preprocessCore_1.56.0  crosstalk_1.2.0       
##  [49] iterators_1.0.14       xfun_0.29              rvest_1.0.2           
##  [52] lifecycle_1.0.1        rstatix_0.7.0          zlibbioc_1.40.0       
##  [55] scales_1.1.1           hms_1.1.1              parallel_4.1.2        
##  [58] RColorBrewer_1.1-2     yaml_2.3.5             memoise_2.0.1         
##  [61] gridExtra_2.3          sass_0.4.0             rpart_4.1-15          
##  [64] latticeExtra_0.6-29    stringi_1.7.6          RSQLite_2.2.10        
##  [67] highr_0.9              foreach_1.5.2          checkmate_2.0.0       
##  [70] rlang_1.0.1            pkgconfig_2.0.3        bitops_1.0-7          
##  [73] evaluate_0.15          lattice_0.20-45        labeling_0.4.2        
##  [76] htmlwidgets_1.5.4      cowplot_1.1.1          bit_4.0.4             
##  [79] tidyselect_1.1.2       magrittr_2.0.2         R6_2.5.1              
##  [82] generics_0.1.2         Hmisc_4.6-0            DelayedArray_0.20.0   
##  [85] DBI_1.1.2              pillar_1.7.0           haven_2.4.3           
##  [88] foreign_0.8-81         withr_2.4.3            abind_1.4-5           
##  [91] survival_3.2-13        KEGGREST_1.34.0        RCurl_1.98-1.6        
##  [94] nnet_7.3-16            car_3.0-12             modelr_0.1.8          
##  [97] crayon_1.5.0           utf8_1.2.2             tzdb_0.2.0            
## [100] rmarkdown_2.11         jpeg_0.1-9             grid_4.1.2            
## [103] readxl_1.3.1           data.table_1.14.2      blob_1.2.2            
## [106] reprex_2.0.1           digest_0.6.29          munsell_0.5.0         
## [109] bslib_0.3.1